In [38]:
# GENDER AGENDAS MAPPER
# V3 - July 2025
# Developed by the Gender Justice Data Hub
# CC BY-NC-SA 4.0, Global Fund for Women
In [3]:
# Uninstall everything related
!pip uninstall -y torch torchvision torchaudio transformers sentence-transformers bertopic umap-learn hdbscan accelerate bitsandbytes xformers
# Clear pip cache
!pip cache purge
# First install PyTorch with CUDA
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
# Then the base dependencies
!pip install transformers==4.35.2 accelerate bitsandbytes
# Install sentence-transformers before BERTopic
!pip install sentence-transformers
# Finally install BERTopic and its dependencies
!pip install bertopic umap-learn hdbscan adjustText
Found existing installation: torch 2.2.2 Uninstalling torch-2.2.2: Successfully uninstalled torch-2.2.2 WARNING: Skipping torchvision as it is not installed. WARNING: Skipping torchaudio as it is not installed. Found existing installation: transformers 4.53.2 Uninstalling transformers-4.53.2: Successfully uninstalled transformers-4.53.2 Found existing installation: sentence-transformers 5.0.0 Uninstalling sentence-transformers-5.0.0: Successfully uninstalled sentence-transformers-5.0.0 Found existing installation: bertopic 0.17.3 Uninstalling bertopic-0.17.3: Successfully uninstalled bertopic-0.17.3 Found existing installation: umap-learn 0.5.9.post2 Uninstalling umap-learn-0.5.9.post2: Successfully uninstalled umap-learn-0.5.9.post2 Found existing installation: hdbscan 0.8.40 Uninstalling hdbscan-0.8.40: Successfully uninstalled hdbscan-0.8.40 Found existing installation: accelerate 1.8.1 Uninstalling accelerate-1.8.1: Successfully uninstalled accelerate-1.8.1 Found existing installation: bitsandbytes 0.42.0 Uninstalling bitsandbytes-0.42.0: Successfully uninstalled bitsandbytes-0.42.0 WARNING: Skipping xformers as it is not installed. Files removed: 104 Looking in indexes: https://download.pytorch.org/whl/cu124 ERROR: Could not find a version that satisfies the requirement torch==2.6.0 (from versions: none) ERROR: No matching distribution found for torch==2.6.0 Collecting transformers==4.35.2 Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 3.1 MB/s eta 0:00:00 Collecting accelerate Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB) Collecting bitsandbytes Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB) Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (3.13.1) Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (0.33.4) Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (1.26.4) Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (23.1) Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (6.0.1) Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (2023.10.3) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (2.32.3) Collecting tokenizers<0.19,>=0.14 (from transformers==4.35.2) Downloading tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl.metadata (6.7 kB) Requirement already satisfied: safetensors>=0.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (0.5.2) Requirement already satisfied: tqdm>=4.27 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (4.65.0) Requirement already satisfied: psutil in /opt/anaconda3/lib/python3.11/site-packages (from accelerate) (5.9.0) Collecting torch>=2.0.0 (from accelerate) Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB) Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from bitsandbytes) (1.11.4) Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (2023.6.0) Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (4.14.0) Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (1.1.5) Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (1.12) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (3.1) Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (3.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2025.4.26) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=2.0.0->accelerate) (2.1.3) Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=2.0.0->accelerate) (1.3.0) Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.9/7.9 MB 7.8 MB/s eta 0:00:0000:0100:01m Downloading accelerate-1.8.1-py3-none-any.whl (365 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 365.3/365.3 kB 10.6 MB/s eta 0:00:00 Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.0/105.0 MB 9.0 MB/s eta 0:00:0000:01m00:01 Downloading tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl (2.6 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.6/2.6 MB 9.0 MB/s eta 0:00:0000:0100:01m Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl (150.8 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.8/150.8 MB 10.1 MB/s eta 0:00:0000:0100:01 Installing collected packages: torch, bitsandbytes, tokenizers, accelerate, transformers Attempting uninstall: tokenizers Found existing installation: tokenizers 0.21.2 Uninstalling tokenizers-0.21.2: Successfully uninstalled tokenizers-0.21.2 Successfully installed accelerate-1.8.1 bitsandbytes-0.42.0 tokenizers-0.15.2 torch-2.2.2 transformers-4.35.2 Collecting sentence-transformers Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB) Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers) Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 2.2 MB/s eta 0:00:00 Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.65.0) Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (2.2.2) Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.6.1) Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.11.4) Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (0.33.4) Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (10.2.0) Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.14.0) Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.13.1) Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2023.6.0) Requirement already satisfied: packaging>=20.9 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (23.1) Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.1) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3) Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (1.1.5) Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (1.12) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1) Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.3) Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4) Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2023.10.3) Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers) Downloading tokenizers-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.8 kB) Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.2) Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (3.5.0) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2025.4.26) Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0) Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 470.2/470.2 kB 9.6 MB/s eta 0:00:00ta 0:00:01 Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.8/10.8 MB 12.5 MB/s eta 0:00:0000:0100:01 Downloading tokenizers-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl (2.9 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.9/2.9 MB 11.2 MB/s eta 0:00:0000:0100:01 Installing collected packages: tokenizers, transformers, sentence-transformers Attempting uninstall: tokenizers Found existing installation: tokenizers 0.15.2 Uninstalling tokenizers-0.15.2: Successfully uninstalled tokenizers-0.15.2 Attempting uninstall: transformers Found existing installation: transformers 4.35.2 Uninstalling transformers-4.35.2: Successfully uninstalled transformers-4.35.2 Successfully installed sentence-transformers-5.0.0 tokenizers-0.21.2 transformers-4.53.2 Collecting bertopic Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB) Collecting umap-learn Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB) Collecting hdbscan Downloading hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl.metadata (15 kB) Requirement already satisfied: adjustText in /opt/anaconda3/lib/python3.11/site-packages (1.3.0) Requirement already satisfied: numpy>=1.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (1.26.4) Requirement already satisfied: pandas>=1.1.5 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (2.1.4) Requirement already satisfied: plotly>=4.7.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (5.9.0) Requirement already satisfied: scikit-learn>=1.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (1.6.1) Requirement already satisfied: sentence-transformers>=0.4.1 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (5.0.0) Requirement already satisfied: tqdm>=4.41.1 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (4.65.0) Requirement already satisfied: llvmlite>0.36.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (0.42.0) Requirement already satisfied: scipy>=1.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (1.11.4) Requirement already satisfied: numba>=0.51.2 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (0.59.0) Requirement already satisfied: pynndescent>=0.5 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (0.5.13) Requirement already satisfied: joblib>=1.0 in /opt/anaconda3/lib/python3.11/site-packages (from hdbscan) (1.2.0) Requirement already satisfied: matplotlib in /opt/anaconda3/lib/python3.11/site-packages (from adjustText) (3.8.0) Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2023.3) Requirement already satisfied: tenacity>=6.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from plotly>=4.7.0->bertopic) (8.2.2) Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn>=1.0->bertopic) (3.5.0) Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (4.53.2) Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (2.2.2) Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (0.33.4) Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (10.2.0) Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (4.14.0) Requirement already satisfied: contourpy>=1.0.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (1.2.0) Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (1.4.4) Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (23.1) Requirement already satisfied: pyparsing>=2.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (3.0.9) Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (3.13.1) Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2023.6.0) Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (6.0.1) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.32.3) Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (1.1.5) Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas>=1.1.5->bertopic) (1.16.0) Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (1.12) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (3.1) Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (3.1.3) Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (2023.10.3) Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (0.21.2) Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (0.5.2) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (2.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2025.4.26) Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (1.3.0) Downloading bertopic-0.17.3-py3-none-any.whl (153 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 153.0/153.0 kB 5.7 MB/s eta 0:00:00 Downloading umap_learn-0.5.9.post2-py3-none-any.whl (90 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 90.1/90.1 kB 7.1 MB/s eta 0:00:00 Downloading hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl (1.5 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 11.8 MB/s eta 0:00:0000:0100:01 Installing collected packages: hdbscan, umap-learn, bertopic Successfully installed bertopic-0.17.3 hdbscan-0.8.40 umap-learn-0.5.9.post2
In [4]:
!pip install openai --upgrade
Requirement already satisfied: openai in /opt/anaconda3/lib/python3.11/site-packages (1.88.0) Collecting openai Downloading openai-1.95.1-py3-none-any.whl.metadata (29 kB) Requirement already satisfied: anyio<5,>=3.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.2.0) Requirement already satisfied: distro<2,>=1.7.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (1.8.0) Requirement already satisfied: httpx<1,>=0.23.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (0.28.1) Requirement already satisfied: jiter<1,>=0.4.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (0.10.0) Requirement already satisfied: pydantic<3,>=1.9.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (2.8.2) Requirement already satisfied: sniffio in /opt/anaconda3/lib/python3.11/site-packages (from openai) (1.3.0) Requirement already satisfied: tqdm>4 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.65.0) Requirement already satisfied: typing-extensions<5,>=4.11 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.14.0) Requirement already satisfied: idna>=2.8 in /opt/anaconda3/lib/python3.11/site-packages (from anyio<5,>=3.5.0->openai) (3.10) Requirement already satisfied: certifi in /opt/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (2025.4.26) Requirement already satisfied: httpcore==1.* in /opt/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (1.0.7) Requirement already satisfied: h11<0.15,>=0.13 in /opt/anaconda3/lib/python3.11/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0) Requirement already satisfied: annotated-types>=0.4.0 in /opt/anaconda3/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (0.6.0) Requirement already satisfied: pydantic-core==2.20.1 in /opt/anaconda3/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (2.20.1) Downloading openai-1.95.1-py3-none-any.whl (755 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 755.6/755.6 kB 11.2 MB/s eta 0:00:00a 0:00:01 Installing collected packages: openai Attempting uninstall: openai Found existing installation: openai 1.88.0 Uninstalling openai-1.88.0: Successfully uninstalled openai-1.88.0 Successfully installed openai-1.95.1
In [5]:
!pip install polars-lts-cpu
Requirement already satisfied: polars-lts-cpu in /opt/anaconda3/lib/python3.11/site-packages (1.31.0)
In [13]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
from bertopic import BERTopic
print("BERTopic imported successfully")
from sentence_transformers import SentenceTransformer
print("SentenceTransformers is working")
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
import openai
PyTorch version: 2.2.2 CUDA available: False BERTopic imported successfully SentenceTransformers is working
In [17]:
import pandas as pd
import re
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from bertopic import BERTopic
from bertopic.representation import OpenAI, KeyBERTInspired, MaximalMarginalRelevance
# Load API key from .env
from dotenv import load_dotenv
import os
import openai
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
print("Packages loaded successfully.")
Packages loaded successfully.
[nltk_data] Downloading package stopwords to /Users/Condi/nltk_data... [nltk_data] Package stopwords is already up-to-date!
In [19]:
import openai
from dotenv import load_dotenv
import os
# Load API key from .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Create OpenAI client for v1.x
client = openai.OpenAI(api_key=api_key)
# Load topic representation
from bertopic.representation import OpenAI as OpenAI_Representation
representation_model = OpenAI_Representation(
client=client,
model="gpt-4o",
delay_in_seconds=10
)
# Prompt
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short topic label in the following format:
topic: <topic label>
"""
In [21]:
import pandas as pd
file_list = [
"Input/gender_Apr25-1.csv"
]
# Read, convert date, and filter in a single line per file
dfs = []
for f in file_list:
df_temp = pd.read_csv(f)
df_temp["event_date"] = pd.to_datetime(df_temp["event_date"])
df_temp = df_temp[df_temp["event_date"] >= "2018-01-01"]
dfs.append(df_temp)
# Concatenate all filtered DataFrames
df = pd.concat(dfs, ignore_index=True)
In [23]:
print("Number of rows after filtering:", len(df))
print("Dates:", df["event_date"].min(), "→", df["event_date"].max())
Number of rows after filtering: 78145 Dates: 2018-01-01 00:00:00 → 2025-04-25 00:00:00
In [25]:
import polars as pl
pl_df = pl.from_pandas(df)
pl_df
Out[25]:
shape: (78_145, 31)
| event_id_cnty | event_date | year | time_precision | disorder_type | event_type | sub_event_type | actor1 | assoc_actor_1 | inter1 | actor2 | assoc_actor_2 | inter2 | interaction | civilian_targeting | iso | region | country | admin1 | admin2 | admin3 | location | latitude | longitude | geo_precision | source | source_scale | notes | fatalities | tags | timestamp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | datetime[ns] | i64 | i64 | str | str | str | str | str | str | str | str | str | str | str | i64 | str | str | str | str | str | str | f64 | f64 | i64 | str | str | str | i64 | str | i64 |
| "ARG16601" | 2025-04-25 00:00:00 | 2025 | 1 | "Demonstrations" | "Protests" | "Peaceful protest" | "Protesters (Argentina)" | "Women (Argentina)" | "Protesters" | null | null | null | "Protesters only" | null | 32 | "South America" | "Argentina" | "Cordoba" | "Punilla" | null | "Capilla del Monte" | -30.8568 | -64.5258 | 1 | "El Diario de Carlos Paz" | "Subnational" | "On 25 April 2025, in Capilla d… | 0 | "crowd size=large" | 1745881584 |
| "BRA96908" | 2025-04-25 00:00:00 | 2025 | 2 | "Political violence" | "Violence against civilians" | "Attack" | "CV: Red Command" | null | "Political militia" | "Civilians (Brazil)" | "Women (Brazil)" | "Civilians" | "Political militia-Civilians" | "Civilian targeting" | 76 | "South America" | "Brazil" | "Bahia" | "Salvador" | null | "Salvador" | -12.9711 | -38.5108 | 1 | "Alo Juca; Bnews (Brazil)" | "Subnational-National" | "Around 25 April 2025 (as repor… | 1 | "women targeted: girls" | 1745881585 |
| "ISR45719" | 2025-04-25 00:00:00 | 2025 | 1 | "Demonstrations" | "Protests" | "Peaceful protest" | "Protesters (Israel)" | "Shift 101; Women (Israel)" | "Protesters" | null | null | null | "Protesters only" | null | 376 | "Middle East" | "Israel" | "Jerusalem" | "Jerusalem" | "Judean Mountains" | "Jerusalem" | 31.769 | 35.2163 | 1 | "Haaretz" | "National" | "On 25 April 2025, about 200 Is… | 0 | "crowd size=about 200" | 1745881590 |
| "MEX103000" | 2025-04-25 00:00:00 | 2025 | 1 | "Political violence" | "Violence against civilians" | "Attack" | "Unidentified Armed Group (Mexi… | null | "Political militia" | "Civilians (Mexico)" | "Labor Group (Mexico); Women (M… | "Civilians" | "Political militia-Civilians" | "Civilian targeting" | 484 | "North America" | "Mexico" | "Guanajuato" | "Leon" | null | "Leon de los Aldama" | 21.122 | -101.6832 | 1 | "Zona Franca" | "Subnational" | "On 25 April 2025, in Leon de l… | 1 | null | 1745881592 |
| "MEX103223" | 2025-04-25 00:00:00 | 2025 | 1 | "Political violence" | "Violence against civilians" | "Attack" | "Unidentified Gang (Mexico)" | null | "Political militia" | "Civilians (Mexico)" | "Women (Mexico)" | "Civilians" | "Political militia-Civilians" | "Civilian targeting" | 484 | "North America" | "Mexico" | "Veracruz de Ignacio de la Llav… | "Coxquihui" | null | "Sabanas de Xalostoc" | 20.2216 | -97.5349 | 1 | "Imagen del Golfo" | "Subnational" | "On 25 April 2025, in Sabanas d… | 2 | "women targeted: relatives of t… | 1745881593 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| "VEN8468" | 2018-01-01 00:00:00 | 2018 | 2 | "Political violence; Demonstrat… | "Protests" | "Excessive force against protes… | "Protesters (Venezuela)" | "Women (Venezuela)" | "Protesters" | "Military Forces of Venezuela (… | null | "State forces" | "State forces-Protesters" | "Civilian targeting" | 862 | "South America" | "Venezuela" | "Distrito Capital" | "Libertador" | null | "Caracas - Libertador" | 10.5127 | -66.9129 | 1 | "Venezuelanalysis.com" | "International" | "Around 1 January 2018 (as repo… | 1 | "crowd size=no report; women ta… | 1675724415 |
| "DRC11890" | 2018-01-01 00:00:00 | 2018 | 1 | "Political violence" | "Violence against civilians" | "Attack" | "Nyatura Militia (Kasongo)" | null | "Identity militia" | "Civilians (Democratic Republic… | "Women (Democratic Republic of … | "Civilians" | "Identity militia-Civilians" | "Civilian targeting" | 180 | "Middle Africa" | "Democratic Republic of Congo" | "Nord-Kivu" | "Walikale" | "Walikale" | "Walikale" | -1.428 | 28.073 | 1 | "Kivu Security Tracker" | "Other" | "On 1 January 2018, an 18 year … | 1 | null | 1694477354 |
| "MEX26708" | 2018-01-01 00:00:00 | 2018 | 3 | "Political violence" | "Violence against civilians" | "Sexual violence" | "Police Forces of Mexico (2012-… | null | "State forces" | "Civilians (Mexico)" | "Women (Mexico)" | "Civilians" | "State forces-Civilians" | "Civilian targeting" | 484 | "North America" | "Mexico" | "Nuevo Leon" | "San Pedro Garza Garcia" | null | "San Pedro Garza Garcia" | 25.6652 | -100.4025 | 1 | "El Norte" | "Subnational" | "On January 2018 in San Pedro G… | 0 | null | 1702344316 |
| "DRC11889" | 2018-01-01 00:00:00 | 2018 | 1 | "Political violence" | "Violence against civilians" | "Sexual violence" | "Unidentified Armed Group (Demo… | null | "Political militia" | "Civilians (Democratic Republic… | "Women (Democratic Republic of … | "Civilians" | "Political militia-Civilians" | "Civilian targeting" | 180 | "Middle Africa" | "Democratic Republic of Congo" | "Sud-Kivu" | "Uvira" | "Uvira" | "Mulima" | -3.3819 | 29.1395 | 1 | "Kivu Security Tracker" | "Other" | "On 1 January 2018, unidentifie… | 0 | "women targeted: girls" | 1705978242 |
| "SYR18080" | 2018-01-01 00:00:00 | 2018 | 1 | "Political violence" | "Violence against civilians" | "Attack" | "Islamic State in Iraq and the … | null | "Rebel group" | "Civilians (Syria)" | "Prisoners (Syria); Women (Syri… | "Civilians" | "Rebel group-Civilians" | "Civilian targeting" | 760 | "Middle East" | "Syria" | "Deir ez Zor" | "Deir ez Zor" | "Deir ez Zor" | "Deir-ez-Zor" | 35.3319 | 40.1461 | 3 | "SNHR; SOHR" | "Local partner-Other" | "A woman was executed by Islami… | 1 | null | 1730157026 |
In [27]:
import polars as pl
import pandas as pd
import re
# Get unique list of places and create the pattern
place_cols = ["country", "location", "admin1", "admin2", "admin3"]
place_series = [pl_df[col].drop_nulls().unique() for col in place_cols]
places = set()
for s in place_series:
places.update(s.to_list())
places = [l.strip() for l in places if isinstance(l, str)]
pattern = r'\b(' + '|'.join(map(re.escape, places)) + r')\b'
# Apply cleaning directly in Polars
pl_df_clean = pl_df.with_columns(
pl.col("notes")
.cast(pl.String)
.str.replace_all(pattern, "")
.str.replace_all(r"\b(19|20)\d{2}\b", "")
.alias("notes_clean")
)
# Extract the final list
documents = pl_df_clean["notes_clean"].drop_nulls().to_list()
In [29]:
titles = df["event_id_cnty"].dropna().tolist()
In [31]:
import pandas as pd
import polars as pl
def glimpse_polars(df: pl.DataFrame, max_cols=100, max_rows=5):
print(f"Observations: {df.height:,}")
print(f"Variables: {df.width:,}")
print("-" * 100)
col_info = []
for i, col_name in enumerate(df.columns):
if i < max_cols:
col_series = df.get_column(col_name)
dtype = col_series.dtype
non_null_count = col_series.len() - col_series.null_count()
unique_count = col_series.n_unique()
sample_values = col_series.drop_nulls().unique().slice(0, max_rows).to_list()
col_info.append({
"Variable": col_name,
"Type": dtype,
"Non-Null": f"{non_null_count:,}",
"Unique": f"{unique_count:,}",
"Sample Values": sample_values
})
col_info_df = pd.DataFrame(col_info)
print(col_info_df.to_string(index=False, max_colwidth=100))
if df.width > max_cols:
print(f"\n... and {df.width - max_cols} more variables")
print("-" * 100)
glimpse_polars(pl_df_clean)
Observations: 78,145
Variables: 32
----------------------------------------------------------------------------------------------------
Variable Type Non-Null Unique Sample Values
event_id_cnty String 78,145 78,145 [YEM96399, IND52439, TUR9336, IND132953, PER1864]
event_date Datetime(time_unit='ns', time_zone=None) 78,145 2,672 [2018-01-01 00:00:00, 2018-01-02 00:00:00, 2018-01-03 00:00:00, 2018-01-04 00:00:00, 2018-01-05 0...
year Int64 78,145 8 [2018, 2019, 2020, 2021, 2022]
time_precision Int64 78,145 3 [1, 2, 3]
disorder_type String 78,145 4 [Strategic developments, Political violence, Political violence; Demonstrations, Demonstrations]
event_type String 78,145 5 [Explosions/Remote violence, Strategic developments, Violence against civilians, Riots, Protests]
sub_event_type String 78,145 16 [Attack, Mob violence, Other, Peaceful protest, Remote explosive/landmine/IED]
actor1 String 78,145 1,760 [Police Forces of Peru (2020-2020), Police Forces of Burundi (2005-), Police Forces of Indonesia ...
assoc_actor_1 String 58,855 12,468 [Chippewa Tribal Group (United States); DEM: Democratic Party; Government of the United States (2...
inter1 String 78,145 8 [Political militia, Identity militia, Protesters, Rebel group, Civilians]
actor2 String 26,328 560 [Civilians (Haiti), Private Security Forces (Moldova), Civilians (Guinea-Bissau), Civilians (Taiw...
assoc_actor_2 String 23,284 3,483 [Farmers (Colombia); Women (Colombia), Women (East Timor), Journalists (Turkmenistan); Women (Tur...
inter2 String 26,328 8 [External/Other forces, State forces, Protesters, Rioters, Political militia]
interaction String 78,145 22 [Rebel group-Civilians, Rebel group-Rioters, Protesters only, Rioters-Civilians, Political militi...
civilian_targeting String 20,541 2 [Civilian targeting]
iso Int64 78,145 195 [0, 4, 8, 12, 20]
region String 78,145 16 [Oceania, Caribbean, Europe, Western Africa, South America]
country String 78,145 195 [Spain, Bosnia and Herzegovina, Honduras, Malta, Democratic Republic of Congo]
admin1 String 78,145 2,118 [Baringo, Loreto, Grodno, Mayabeque, Loja]
admin2 String 76,250 9,590 [Edirne, Bushehr, Sao Jose dos Quatro Marcos, Atrato, Bangi]
admin3 String 29,184 5,927 [Midebdo, Kopera, Tirioko, Jamalpur, Dimow]
location String 78,145 19,306 [Tlacuilotepec, Ciales, Hunucma, Ah Htet Ngar Nan, Sanok]
latitude Float64 78,145 19,693 [-54.8062, -54.5119, -53.7865, -53.1548, -51.7308]
longitude Float64 78,145 19,856 [-171.7553, -161.7558, -159.7804, -159.3721, -158.4575]
geo_precision Int64 78,145 3 [1, 2, 3]
source String 78,145 15,326 [Aydinlik; Bianet; Evrensel; Sendika.org, N12, Daily Independent (Nigeria); Daily Trust (Nigeria)...
source_scale String 78,145 26 [New media-International, Other-New media, Regional, New media-Regional, Other-National]
notes String 78,145 77,911 [On 16 August 2019, teachers and their families including women staged a protest in Agartala city...
fatalities Int64 78,145 47 [0, 1, 2, 3, 4]
tags String 62,222 2,692 [crowd size=more than 530, crowd size=10,000 - 20,000, crowd size=approximately 35, counter-demon...
timestamp Int64 78,145 14,517 [1559160369, 1559160524, 1559160525, 1559160527, 1559160529]
notes_clean String 78,145 72,931 [On 13 October , women (likely ) staged a protest in district (coded to town, ) and blocked a c...
----------------------------------------------------------------------------------------------------
In [33]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI
# Embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)
# UMAP for dimensionality reduction (5D for clustering)
umap_model = UMAP(
n_neighbors=40,
n_components=5,
min_dist=0.4,
metric='cosine',
random_state=42
)
# HDBSCAN for clustering
hdbscan_model = HDBSCAN(
min_cluster_size=40,
min_samples=15,
metric='euclidean',
cluster_selection_method='eom',
prediction_data=True
)
# Dimensionality reduction for visualization (2D)
reduced_embeddings = UMAP(
n_neighbors=15,
n_components=2,
min_dist=0.0,
metric='cosine',
random_state=42
).fit_transform(embeddings)
# Stopwords and vectorization
stopwords_total = set(stopwords.words("spanish") + stopwords.words("english"))
vectorizer_model = CountVectorizer(stop_words=list(stopwords_total))
# Semantic representations
representation_model = {
"KeyBERT": KeyBERTInspired(),
"MMR": MaximalMarginalRelevance(diversity=0.3),
"OpenAI": OpenAI(
client=client,
model="gpt-4o",
prompt=prompt
)
}
Batches: 0%| | 0/2443 [00:00<?, ?it/s]
In [36]:
topic_model = BERTopic(
embedding_model=embedding_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
representation_model=representation_model,
top_n_words=10,
verbose=True
)
# Fit the model and transform the documents
topics, probs = topic_model.fit_transform(documents, embeddings)
2025-07-14 18:09:35,917 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm 2025-07-14 18:11:11,761 - BERTopic - Dimensionality - Completed ✓ 2025-07-14 18:11:11,763 - BERTopic - Cluster - Start clustering the reduced embeddings huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) 2025-07-14 18:11:15,792 - BERTopic - Cluster - Completed ✓ 2025-07-14 18:11:15,801 - BERTopic - Representation - Fine-tuning topics using representation models. 100%|██████████| 255/255 [06:10<00:00, 1.45s/it] 2025-07-14 18:17:46,001 - BERTopic - Representation - Completed ✓
In [37]:
# Basic topic information
print(topic_model.get_topic_info())
# Keywords of a specific topic
print(topic_model.get_topic(0))
# Visualization
topic_model.visualize_topics()
Topic Count Name \
0 -1 35963 -1_women_group_march_protest
1 0 1942 0_found_body_fatality_wrapped
2 1 1410 1_conference_press_picketed_urged
3 2 1310 2_meitei_meira_tribal_communities
4 3 971 3_femicide_justice_demand_femicides
.. ... ... ...
250 249 41 249_hijab_religious_sociopolitical_veil
251 250 41 250_coordinated_restrictions_abortions_mostly
252 251 40 251_morality_infringing_iranian_hijab
253 252 40 252_akoko_fulani_lga_pastoralists
254 253 40 253_femicide_case_town_recent
Representation \
0 [women, group, march, protest, international, ...
1 [found, body, fatality, wrapped, tied, reporte...
2 [conference, press, picketed, urged, seoul, ko...
3 [meitei, meira, tribal, communities, amidst, p...
4 [femicide, justice, demand, femicides, case, f...
.. ...
250 [hijab, religious, sociopolitical, veil, guida...
251 [coordinated, restrictions, abortions, mostly,...
252 [morality, infringing, iranian, hijab, died, c...
253 [akoko, fulani, lga, pastoralists, ruler, comm...
254 [femicide, case, town, recent, occurred, targe...
KeyBERT \
0 [demonstrators, protesters, protested, protest...
1 [strangled, corpse, decapitated, fatalities, d...
2 [picketed, pickets, solidarity, kctu, committe...
3 [protest, activists, militants, meitei, arrest...
4 [victims, protested, protesters, protest, acti...
.. ...
250 [hijabs, hijab, sharia, islamic, police, arres...
251 [activists, protesting, abortions, rally, wome...
252 [protesters, protestors, protests, hijab, iran...
253 [protested, grievance, ekoko, akoko, attacks, ...
254 [protest, protesting, activists, women, group,...
MMR \
0 [group, march, protest, day, woman, police, pr...
1 [found, fatality, wrapped, colonia, torture, w...
2 [conference, picketed, seoul, banners, harassm...
3 [meitei, tribal, paibi, torch, apunba, state, ...
4 [femicide, justice, cases, victims, feminist, ...
.. ...
250 [hijab, sociopolitical, veil, forbidding, isla...
251 [coordinated, restrictions, abortions, protest...
252 [morality, iranian, hijab, rules, iranians, ve...
253 [akoko, fulani, pastoralists, protested, axis,...
254 [femicide, town, cases, protest, pozcu, incide...
OpenAI \
0 [Protests and Violence Against Women]
1 [Women's Homicides with Bodies Found Wrapped i...
2 [Protests Against Sexual Harassment and Advoca...
3 [Meira Paibi Protests Amidst Meitei-Tribal Con...
4 [Protests for Justice in Femicide Cases]
.. ...
250 [Enforcement of Hijab by Iranian Morality Poli...
251 [Women's Protest Against Abortion Restrictions]
252 [Protests Against Hijab Rules and Mahsa Amini'...
253 [Protests Against Insecurity and Fulani Pastor...
254 [Protests Against Femicide by Women's Groups]
Representative_Docs
0 [On 8 March , in , , around 50 women from vari...
1 [Around 14 February (as reported), in , , a w...
2 [On 16 November , members from the National o...
3 [On 11 August , local Meira Paibi (likely Meit...
4 [On 16 January , in (), 100 people, including...
.. ...
250 [Other: On 18 June , Iranian Guidance Patrol p...
251 [On 30 October , activists, mostly women, gath...
252 [On 22 October , protestors, including Iranian...
253 [On 15 January , hundreds of women from the Ak...
254 [On 14 January , a women group staged a protes...
[255 rows x 8 columns]
[('found', 0.066610601124112), ('body', 0.06484559077992424), ('fatality', 0.03838972406268369), ('wrapped', 0.03519847420814442), ('tied', 0.033365548692720605), ('reported', 0.03319917444221136), ('colonia', 0.031008692351572328), ('plastic', 0.026284319891301675), ('signs', 0.025845700064423587), ('killed', 0.024522933445648536)]
In [42]:
topic_model.get_topic_info()
Out[42]:
| Topic | Count | Name | Representation | KeyBERT | MMR | OpenAI | Representative_Docs | |
|---|---|---|---|---|---|---|---|---|
| 0 | -1 | 35963 | -1_women_group_march_protest | [women, group, march, protest, international, ... | [demonstrators, protesters, protested, protest... | [group, march, protest, day, woman, police, pr... | [Protests and Violence Against Women] | [On 8 March , in , , around 50 women from vari... |
| 1 | 0 | 1942 | 0_found_body_fatality_wrapped | [found, body, fatality, wrapped, tied, reporte... | [strangled, corpse, decapitated, fatalities, d... | [found, fatality, wrapped, colonia, torture, w... | [Women's Homicides with Bodies Found Wrapped i... | [Around 14 February (as reported), in , , a w... |
| 2 | 1 | 1410 | 1_conference_press_picketed_urged | [conference, press, picketed, urged, seoul, ko... | [picketed, pickets, solidarity, kctu, committe... | [conference, picketed, seoul, banners, harassm... | [Protests Against Sexual Harassment and Advoca... | [On 16 November , members from the National o... |
| 3 | 2 | 1310 | 2_meitei_meira_tribal_communities | [meitei, meira, tribal, communities, amidst, p... | [protest, activists, militants, meitei, arrest... | [meitei, tribal, paibi, torch, apunba, state, ... | [Meira Paibi Protests Amidst Meitei-Tribal Con... | [On 11 August , local Meira Paibi (likely Meit... |
| 4 | 3 | 971 | 3_femicide_justice_demand_femicides | [femicide, justice, demand, femicides, case, f... | [victims, protested, protesters, protest, acti... | [femicide, justice, cases, victims, feminist, ... | [Protests for Justice in Femicide Cases] | [On 16 January , in (), 100 people, including... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 250 | 249 | 41 | 249_hijab_religious_sociopolitical_veil | [hijab, religious, sociopolitical, veil, guida... | [hijabs, hijab, sharia, islamic, police, arres... | [hijab, sociopolitical, veil, forbidding, isla... | [Enforcement of Hijab by Iranian Morality Poli... | [Other: On 18 June , Iranian Guidance Patrol p... |
| 251 | 250 | 41 | 250_coordinated_restrictions_abortions_mostly | [coordinated, restrictions, abortions, mostly,... | [activists, protesting, abortions, rally, wome... | [coordinated, restrictions, abortions, protest... | [Women's Protest Against Abortion Restrictions] | [On 30 October , activists, mostly women, gath... |
| 252 | 251 | 40 | 251_morality_infringing_iranian_hijab | [morality, infringing, iranian, hijab, died, c... | [protesters, protestors, protests, hijab, iran... | [morality, iranian, hijab, rules, iranians, ve... | [Protests Against Hijab Rules and Mahsa Amini'... | [On 22 October , protestors, including Iranian... |
| 253 | 252 | 40 | 252_akoko_fulani_lga_pastoralists | [akoko, fulani, lga, pastoralists, ruler, comm... | [protested, grievance, ekoko, akoko, attacks, ... | [akoko, fulani, pastoralists, protested, axis,... | [Protests Against Insecurity and Fulani Pastor... | [On 15 January , hundreds of women from the Ak... |
| 254 | 253 | 40 | 253_femicide_case_town_recent | [femicide, case, town, recent, occurred, targe... | [protest, protesting, activists, women, group,... | [femicide, town, cases, protest, pozcu, incide... | [Protests Against Femicide by Women's Groups] | [On 14 January , a women group staged a protes... |
255 rows × 8 columns
In [44]:
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
for topic_id, label_info in openai_topics.items():
label = label_info[0][0].split("\n")[0]
print(f"Topic {topic_id:>2}: {label}")
Topic -1: Protests and Violence Against Women Topic 0: Women's Homicides with Bodies Found Wrapped in Blankets Topic 1: Protests Against Sexual Harassment and Advocacy for Gender Equality in South Korea Topic 2: Meira Paibi Protests Amidst Meitei-Tribal Conflict Topic 3: Protests for Justice in Femicide Cases Topic 4: Protests and Unrest Following Mahsa Amini's Death Topic 5: Abortion Rights Protest against Supreme Court Decision to Overturn Roe v. Wade Topic 6: Political Protests and Clashes in India Involving BJP, INC, AIMC Over Women's Safety and Governance Issues Topic 7: Abductions and Ransom Demands in Conflict Zones Topic 8: Anganwadi Workers' Protest for Regularisation and Salary Increase Topic 9: Monthly Flower Demonstrations Supporting MeToo Movement Against Sexual Violence Topic 10: Military Arrests and Detentions in Township Regions Topic 11: Protests Against Gender-Based Violence on International Day Topic 12: Abductions and Attacks by Presumed ISWAP or Boko Haram Militants Topic 13: Comfort Women Protests in Front of Former Japanese Embassy in Seoul Topic 14: Suppression of Female Petitioners and Human Rights Defenders Topic 15: Global Protests for Abortion Legalization on September 28th Topic 16: Gun Violence Awareness and Advocacy Topic 17: Protests Against the Death of Mahsa Amini by Iranian Communities Topic 18: Labor Protests and Worker Rights for Domestic and Health Workers Topic 19: Drive-by Shootings by Armed Motorcyclists Targeting Women Topic 20: Nationwide Protests Against Trump Administration Policies Topic 21: Nationwide Protests Against Transphobic Legislation and Media Topic 22: Women's Rights Activism in Sweden Topic 23: ASHA Health Workers' Wage Protests in J&K Topic 24: Protests Against Japanese Cabinet Over Article 9 Amendment Topic 25: Witchcraft-related Attacks and Mob Violence in Villages Topic 26: Student Protests for Justice for Mahsa Amini and Anti-Government Sentiments Topic 27: Violence Against Transgender Women and Hate Crimes Topic 28: Women's Protests Against Military Coup and Support for Civil Disobedience Movement Topic 29: Protests Against Far-Right Extremism and AfD Topic 30: Women Protesting Water Supply Shortages in J&K Topic 31: Black Lives Matter Movement and Protests against Police Brutality Topic 32: Taliban's Violations and Abuse Against Women Topic 33: Ambazonian Separatists' Abductions and Violence Against Civilians Topic 34: Sexual Violence by Military Personnel in African Regions Topic 35: Drug Trafficking-Related Homicides of Women Topic 36: Political Violence in West Bengal Elections Topic 37: Mothers' Protests for Missing Relatives on Mother's Day Topic 38: Armed Attacks on Women in Colonia Areas Topic 39: Afghan Women's Protests Against Taliban Policies Topic 40: Houthi-Sponsored Protests Against Israeli Actions in Solidarity with Palestinians Topic 41: Extinction Rebellion and Climate Action Protests Topic 42: Police Brutality Against Women Topic 43: Protests Against Isolation and Visitor Ban of PKK Leader Abdullah Ocalan Topic 44: Protests Against Gender-Based Violence in Universities Topic 45: Protests for Social Support and Subsidized Housing for Low-Income Families in Kazakhstan Topic 46: Red Dress Day and MMIW Awareness Events Topic 47: Women's Rights Demonstrations Against Violence Topic 48: Pro-Palestinian Protests for Ceasefire amid Israel-Hamas Conflict Topic 49: Pro-Choice Protests Following Roe v. Wade Overturn Topic 50: Women's March and Advocacy for Rights Topic 51: Women's Strike Protests Against Abortion Restrictions Topic 52: Military Violence and Arson in Villages Topic 53: Mob Violence Against Women Suspected of Child Lifting Topic 54: Protests Against Citizenship (Amendment) Act and NRC by Women Topic 55: Israeli-Palestinian Settler Violence and Injuries Topic 56: Parent Protests Over School Management and Conditions Topic 57: Police Misconduct and Sexual Assault in Custody Topic 58: Mysterious Deaths of Women with Burned Bodies Found Topic 59: Protests Against Gender-Based Violence (GBV) During 16 Days of Activism Topic 60: Ukrainian Protests Against Russian Invasion Topic 61: Motorcycle Drive-by Shootings Targeting Women Topic 62: Hijab Protests in Educational Institutions Topic 63: Protests by Female College Students Over Hostel Conditions Topic 64: Protests Against Sexual Harassment in Wrestling Federation Topic 65: Sexual Assault and Violence by Imbonerakure Members Topic 66: Houthi Sniper Attacks on Civilians in Yemen Topic 67: Monthly Flower Demonstrations Against Sexual Violence Acquittals Topic 68: Sexual Violence and Killings by Military Forces in Conflict Zones Topic 69: National Strike and Demonstration Against Austerity and for Economic and Gender Equality Topic 70: Sexual Violence and Attacks by Militias in Darfur Topic 71: Military Police Officers' Families Protest Over Unpaid Salaries Topic 72: Protests Against Bolsonaro and Gender-Based Violence in Brazil Topic 73: Global Protests for Gender Equality on International Women's Day Topic 74: Protests Against Dismissal of DEM Party Mayor Mehmet Siddik Akis Topic 75: Protests Against Dismissal of HDP Mayors Topic 76: Protests Against Netanyahu's Judicial Overhaul Topic 77: Unidentified Armed Group Attacks on Women Topic 78: Protests Against Farm Laws by SKM and Women Farmers Topic 79: Student Protests Against School Handling of Sexual Misconduct Topic 80: International Day for the Elimination of Violence against Women Protest Flash Mobs Topic 81: Female Traders' Protests Against Market Changes Topic 82: Abortion Rights Protests Addressing Roe v. Wade Overturn Topic 83: Protests against Death of Kurdish Woman in Iranian Police Custody by Swedish-Iranian Community Topic 84: Off-Duty Female Police Officer Shootings Topic 85: Protests Against Rape and Murder of Woman Veterinary Doctor Topic 86: Caste-Based Sexual Violence in Uttar Pradesh Topic 87: Women's Protests for Political Prisoners' Immediate Release in May by February 14 Youth Coalition Topic 88: Women's Protests Against Liquor Shops in Villages Topic 89: International Women's Day Marches and Protests Topic 90: Women's Strike Movement Protesting Abortion Law Restrictions in November Topic 91: Women's Rights Activists Protests for Hostage Release Topic 92: Saturday Mothers' Weekly Protests for Justice and Missing Relatives Topic 93: Women's March Protesting Amy Coney Barrett's Supreme Court Nomination Topic 94: Protests Against Turkish Military Operations in Kurdish Regions Topic 95: Violent Attacks by Suspected Fulani Pastoralists on Farmers and Women Topic 96: Women's Rights Demonstrations in Dalarna Topic 97: Women's Rights and International Women's Day Protests Topic 98: Women's Protests Against Abortion Restrictions Topic 99: Demonstrations Opposing the Overturn of Roe v. Wade Topic 100: Gang-Related Fatalities of Women Topic 101: Protests Against Macron's Appointment of Barnier as Prime Minister Topic 102: Women's Rights Rally Against Abortion Restrictions Topic 103: Supreme Court and Senate Bill 8 Abortion Protests Topic 104: Women's Rights Protest Against Abortion Restrictions in the UK Topic 105: Protests Against Rape and Violence Against Women in Bangladesh Topic 106: Turkish Government's Withdrawal from Women's Rights Convention Topic 107: Sexual Violence by RSF Against Civilians Topic 108: Detainees' Mothers Association Protests Topic 109: University Fraternity Sexual Misconduct Protests Topic 110: Alleged Poisoning and Mass Sociogenic Illness in Schools Topic 111: Supreme Court Protests Against Leaked Draft to Overturn Roe v. Wade Topic 112: Abortion Rights Protests in Response to Leaked Supreme Court Draft on Roe v. Wade Topic 113: Women's Rights Protests Against Withdrawal from Violence Prevention Convention Topic 114: Protests Against Smart Power Meters and Electricity Issues Topic 115: Protests Against Hijab Enforcement in Iran Topic 116: University Protests Against Police Violence and Detention Practices Topic 117: Vanessa Guillen Vigil Protests and Justice Movement Topic 118: Take Back the Night Marches Against Gender-Based Violence Topic 119: Women's Strike Protests Against Abortion Restrictions Topic 120: Nationwide Strikes and Demonstrations for Gender Equality on International Women's Day Topic 121: Armed Attacks on Female Mayoral Candidates Topic 122: Student Protests Against Leaked Supreme Court Draft on Abortion Rights Topic 123: Abortion Rights and Anti-Abortion Protests Topic 124: Protests by Saturday Mothers for Justice for Missing Detainees Topic 125: Protests for Health Services for Ill Prisoners Topic 126: Nationwide Flower Demonstrations Against Sexual Violence Acquittals Topic 127: Violence Against Female Petitioners in China Topic 128: QSD Detainment of Women in Countryside for Unknown Reasons Topic 129: PKK-Affiliated Youth Movement Kidnapping Girls for Conscription in Countryside Areas Topic 130: Murle-Lou Nuer Conflict and Violence Topic 131: Pro-Choice Demonstrations in Response to Dobbs v. Jackson Draft Leak Topic 132: International Women's Day Protests Against Gender Violence and Inequality Topic 133: ADF Attacks on Civilians in Villages Topic 134: Government Actions Against Ladies in White Protests Topic 135: Women's Rights Demonstrations in Gothenburg by Kvinnostrejk Movement Topic 136: Election Protests and Claims of Rigging by PDP Women Topic 137: Demonstrations Supporting Rule of Law Against Far-Right Political Influence Topic 138: Unsolved Fatal Shootings of Women by Unidentified Perpetrators Topic 139: Breast Cancer Awareness and Healthcare Challenges Topic 140: Women's Protests Against HTS for Detainee Release in Countryside Topic 141: Nationwide Protests Against Inflation and Corruption on July 27th Topic 142: Protest Against Gender-Based Violence and Child Disappearance Topic 143: One Billion Rising Protest Against Gender Violence Topic 144: Nepali Congress Anti-Government Demonstrations on September 24 Topic 145: Violence Involving FARC Dissidents and ELN in Rural Areas Topic 146: Protests Against Violence and Femicides on International Day for the Elimination of Violence Against Women Topic 147: Kurdish Newroz Celebrations and Political Demands Topic 148: Mahsa Amini Mourning Observance Protests on University Campuses Topic 149: Protests Over Luis Rubiales' Non-consensual Kiss in Women's World Cup Topic 150: Korean Farmers' Advocacy and Government Policy Criticism Topic 151: Sunni Baloch Protests Against Government Violence Topic 152: International Women's Day Demonstrations for Gender Equality and Women's Rights Topic 153: Protests Against Gender Violence on International Day for Elimination of Violence against Women Topic 154: Women's Rights Demonstrations and Pension Reform Protests on International Women's Day Topic 155: Al Shabaab Attacks on Women Accused of Government Collaboration Topic 156: Sexual Violence and Murder by Unidentified Armed Groups in Conflict Zones Topic 157: International Women's Day Protests Against Femicides and for Women's Rights Topic 158: Women's Rights and Gender Violence Protest on International Women's Day Topic 159: Feminist Protests Against Gender-Based Violence in November Topic 160: Opposition to Alcohol Legalisation in Manipur Topic 161: Houthi-Sponsored Protest in Solidarity with Palestinians Against Zionist Actions Topic 162: Women's Abortion Rights Protests in October Coordinated by Women's Strike Topic 163: International Women's Day Demonstrations for Gender Equality and Equal Pay Topic 164: Advocacy for Amending Prostitution Punishment Laws Topic 165: Women’s Protests Against Police Misconduct and Recruitment Practices Topic 166: Protest Against Quran Burning by Far-Right Politician Topic 167: Dahalo-Related Abductions and Attacks in Madagascar Topic 168: Alleged School Poisonings and Mass Hysteria in Schools Topic 169: Voting Rights and Capitol Riot Remembrance Events Topic 170: Targeted Property Attacks and Vandalism Incidents Topic 171: Women's Rights and Gender Equality in the Catholic Church Topic 172: Protests for Justice and Prevention of Child Murders Topic 173: Women's Strike Protests Against Abortion Restrictions Topic 174: Gang Violence and Sexual Assault Amid Clashes in Commune Areas Topic 175: Protests for Justice for Nirmala Pant Rape and Murder Topic 176: Protest Against Valeri Simeonov by Mothers of Children with Disabilities Topic 177: International Women's Day Protests Against Femicides Topic 178: Protests Against Violence in Arab Israeli Community Topic 179: Protests Supporting Palestinian Prisoners in Israeli Jails Topic 180: Women's Strike Movement Protest Against Abortion Restrictions Topic 181: Kidnappings and Ransom Demands in Nigeria Topic 182: Alleged Poisoning of Female Students and Government Involvement Protest Topic 183: RLD Protests Against Communal Violence and Demand for President's Rule Topic 184: Women's March Against Gender-Based Violence Topic 185: Protest Against Macron's Defense of Gerard Depardieu Amid Rape and Harassment Accusations Topic 186: Houthi-Sponsored Protests in Solidarity with Palestine Topic 187: International Women's Day Marches for Women's Rights Topic 188: Drug Trafficking Tribunals and Victim Abductions Topic 189: Nationwide Strike by Left Trade Unions in Protest Against Economic Policies Topic 190: Advocacy for Comprehensive Legal Reform on Sexual Violence in Support of Gisele Pelicot Topic 191: Protests Against Iranian Hijab Laws and Women's Rights支持 Topic 192: Protests Commemorating EDSA People Power Revolution and Opposing Charter Change in the Philippines Topic 193: Solidarity Protests Against Indian Government's Actions in Kashmir Topic 194: Taliban Restrictions on Women's Media and Public Presence Topic 195: Student Protests Against Government Following Death in Police Custody Topic 196: Meitei Community Protests and Government Response in Tribal Conflict Topic 197: Houthi-Sponsored Demonstrations in Support of Palestinians and Against US-Zionist Actions Topic 198: Women's Protest Against Abortion Restrictions Topic 199: CODECO-URDPC Attacks on Civilians and Looting Topic 200: Protest Against Attack on Jatiya Parishad's State President Topic 201: Mob Justice and Police Intervention in Lynching Incidents Topic 202: Detention and Re-education of Uyghur Women in China Topic 203: Women's Equality Protest Rally Topic 204: Protests Over Mahsa Amini's Death in Police Custody Topic 205: Houthi-Sponsored Protest in Solidarity with Palestinians and Commemoration of Late President Saleh Ali Al Samad Topic 206: Protests Against COVID-19 Maternity Regulations Topic 207: Women's Strike Movement Protesting Abortion Restrictions Topic 208: Protests and Demonstrations in Prisons Regarding Conditions and Visitation Rights Topic 209: Women's Rights Protests for Gender Equality and Abortion Legalization Topic 210: Abuse of Female Political Prisoners by Guards Topic 211: Women's Rights and Abortion Restrictions Protest Topic 212: Protests Against Violence Targeting Women by HDP and TJA Members Topic 213: Vigilante-Style Attacks on Female Drug Suspects in the Philippines Topic 214: Student Protests Against Rape and Violence in Universities Topic 215: Detention of Women by Turkish and Syrian Intelligence Topic 216: Political Protests Against Arrests in Pakistan Topic 217: Female Teachers' Protests for Salary and Rights in Education Sector Topic 218: Protests Against Violence Towards Women in Response to Pelicot Trial Topic 219: Women's Rights and Anti-Patriarchy Protest Movement Topic 220: Anti-Nuclear Protests and Commemoration Movement Topic 221: Nationwide Protests Over Doctor's Rape-Murder Case Topic 222: Protests Against NIA Case on Arambai Tenggol's Chief Amidst Meitei-Tribal Violence Topic 223: Protests Against Femicide Commemorating Giulia Cecchettin Topic 224: Abortion Rights Protests and Counter-Protests Involving Catholic Groups Topic 225: Protests in Response to Death of Woman in Modesty Police Custody Topic 226: Protest Against Government Apathy on Migrant Laborer Deaths During Pandemic Topic 227: Protests Against Government Pressure on Independent Media and Foreign Ownership Restrictions Topic 228: Militia Attacks and Abductions in Villages Topic 229: Houthi-Sponsored Protest on Zayd ibn Ali's Death Anniversary Topic 230: Polish Abortion Law Protests and Solidarity Movements Topic 231: Violence Against Indigenous Women and Leaders Topic 232: Demonstrations for Missing Persons Under State Detention Topic 233: Nationwide Farmer Protest Against Farm Laws Topic 234: Women's Protests Against Judicial Reforms Inspired by "The Handmaid's Tale" Topic 235: Protests for Search of Remains of Missing Indigenous Women at Landfills Topic 236: Protest Against Environmental Impact of Iron Sand Mining Topic 237: Protests for Reinvestigation of Drug Haul Cases Topic 238: Protests Against Sexual Violence in District (Country) Topic 239: Anti-War Protests and Peace Advocacy by Codepink Topic 240: Protest Against Agriculture Minister Pradeep Maharathy for Derogatory Remarks by BJP Mahila Morcha Topic 241: International Women's Day Protests for Gender Equality and Against Gender-Based Violence Topic 242: Houthi Protest Commemorating Tanumah Massacre and Supporting Palestinian Solidarity Topic 243: Anti-Harassment Protests and Accountability in Media & Local Government Topic 244: Barangay Leadership Attacks by Unidentified Assailants Topic 245: Women's Rights Protests and Anti-Gender Violence Performances Topic 246: Demonstrations on Bioethics Law and Assisted Procreation Rights Topic 247: Protests Over Coronavirus Vaccine Availability and Distribution Topic 248: Moroccan Protest Supporting Palestinian Women and Opposing Israeli Actions Topic 249: Enforcement of Hijab by Iranian Morality Police and Sociopolitical Impact Topic 250: Women's Protest Against Abortion Restrictions Topic 251: Protests Against Hijab Rules and Mahsa Amini's Death Topic 252: Protests Against Insecurity and Fulani Pastoralists in Akoko Region Topic 253: Protests Against Femicide by Women's Groups
In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Convert only necessary columns to pandas, preserving order
df_plot = pl_df_clean.select(["country", "notes_clean"]).drop_nulls().to_pandas()
# Ensure the length matches the number of topics
assert len(df_plot) == len(topics), "❌ 'topics' length does not match the cleaned documents."
# Assign topics to cleaned documents
df_doc_topics = pd.DataFrame({
"country": df_plot["country"].values,
"topic": topics
})
# Count topics per country
topic_counts = df_doc_topics.groupby(["country", "topic"]).size().reset_index(name="count")
# Relative percentage per country
topic_counts["total_country"] = topic_counts.groupby("country")["count"].transform("sum")
topic_counts["percentage"] = 100 * topic_counts["count"] / topic_counts["total_country"]
# Get topic labels
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
labels_df = pd.DataFrame([
{"topic": topic_id, "label": label_info[0][0].split("\n")[0]}
for topic_id, label_info in openai_topics.items()
])
# Merge with labels
topic_counts["topic"] = topic_counts["topic"].astype(int)
labels_df["topic"] = labels_df["topic"].astype(int)
topic_counts = topic_counts.merge(labels_df, on="topic", how="left")
# Top-N topics per country
top_n = 5
top_topics_per_country = topic_counts.sort_values(["country", "percentage"], ascending=[True, False])\
.groupby("country").head(top_n)
# Limit to only 3 countries
countries_to_plot = top_topics_per_country["country"].drop_duplicates().sort_values().head(3)
top_topics_per_country = top_topics_per_country[top_topics_per_country["country"].isin(countries_to_plot)]
# Plot with Seaborn
g = sns.FacetGrid(
top_topics_per_country,
col="country",
col_wrap=3,
sharey=False,
height=4,
aspect=1.5
)
g.map_dataframe(
sns.barplot,
x="percentage",
y="label",
palette="tab10"
)
g.set_titles(col_template="{col_name}")
g.set_axis_labels("Percentage (%)", "Topic")
for ax in g.axes.flatten():
for label in ax.get_yticklabels():
label.set_rotation(0)
plt.tight_layout()
plt.show()
In [48]:
openai_labels = [
label[0][0].split("\n")[0]
for label in topic_model.get_topics(full=True)["OpenAI"].values()
]
topic_model.set_topic_labels(openai_labels)
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Convert required columns from Polars
df_plot = pl_df_clean.select(["event_date", "notes_clean"]).drop_nulls().to_pandas()
df_plot["event_date"] = pd.to_datetime(df_plot["event_date"])
# Validate topic length
assert len(df_plot) == len(topics), "❌ Length of 'topics' does not match cleaned documents."
# Create DataFrame of topics per document
df_doc_topics = pd.DataFrame({
"event_date": df_plot["event_date"].values,
"topic": topics
})
# Add month column
df_doc_topics["month"] = df_doc_topics["event_date"].dt.to_period("M").dt.to_timestamp()
# Group by month and topic
monthly_topic_counts = df_doc_topics.groupby(["month", "topic"]).size().reset_index(name="count")
# Calculate percentage within each month
monthly_topic_counts["monthly_total"] = monthly_topic_counts.groupby("month")["count"].transform("sum")
monthly_topic_counts["percentage"] = 100 * monthly_topic_counts["count"] / monthly_topic_counts["monthly_total"]
# Get topic labels from the model
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
labels_df = pd.DataFrame([
{"topic": topic_id, "label": label_info[0][0].split("\n")[0]}
for topic_id, label_info in openai_topics.items()
])
labels_df["topic"] = labels_df["topic"].astype(int)
monthly_topic_counts["topic"] = monthly_topic_counts["topic"].astype(int)
monthly_topic_counts = monthly_topic_counts.merge(labels_df, on="topic", how="left")
# Select top-N global topics by volume
top_n = 5
top_topics = (
monthly_topic_counts.groupby("topic")["count"]
.sum()
.nlargest(top_n)
.index.tolist()
)
df_top = monthly_topic_counts[monthly_topic_counts["topic"].isin(top_topics)]
# Monthly percentage line plot
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=df_top, x="month", y="percentage", hue="label", marker="o")
plt.legend(title="Topic (OpenAI)", bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0)
plt.title(f"Monthly trend (% of total) of the top {top_n} most frequent topics")
plt.xlabel("Month")
plt.ylabel("Percentage of documents (%)")
plt.xticks(rotation=45)
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()
In [54]:
import itertools
import pandas as pd
# Define colors for the visualization to iterate over
colors = itertools.cycle(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'])
color_key = {str(topic): next(colors) for topic in set(topic_model.topics_) if topic != -1}
# Prepare dataframe and ignore outliers
dfo = pd.DataFrame({"x": reduced_embeddings[:, 0], "y": reduced_embeddings[:, 1], "Topic": [str(t) for t in topic_model.topics_]})
dfo["Length"] = [len(doc) for doc in documents]
dfo = dfo.loc[dfo.Topic != "-1"]
dfo = dfo.loc[(dfo.y > -10) & (dfo.y < 10) & (dfo.x < 10) & (dfo.x > -10), :]
dfo["Topic"] = dfo["Topic"].astype("category")
# Get centroids of clusters
mean_df = dfo.groupby("Topic").mean().reset_index()
mean_df.Topic = mean_df.Topic.astype(int)
mean_df = mean_df.sort_values("Topic")
In [56]:
import seaborn as sns
from matplotlib import pyplot as plt
from adjustText import adjust_text
import matplotlib.patheffects as pe
import textwrap
fig = plt.figure(figsize=(20, 20))
sns.scatterplot(
data=dfo,
x='x',
y='y',
hue='Topic',
palette=color_key,
alpha=0.4,
size='Length',
sizes=(10, 200),
legend=False
)
# Annotate top 50 topics
texts, xs, ys = [], [], []
for row in mean_df.iterrows():
topic = row[1]["Topic"]
name = textwrap.fill(topic_model.custom_labels_[int(topic)], 20)
if int(topic) <= 50:
xs.append(row[1]["x"])
ys.append(row[1]["y"])
texts.append(plt.text(row[1]["x"], row[1]["y"], name, size=10, ha="center", color=color_key[str(int(topic))],
path_effects=[pe.withStroke(linewidth=0.5, foreground="black")]
))
# Adjust annotations such that they do not overlap
adjust_text(texts, x=xs, y=ys, time_lim=1, force_text=(0.01, 0.02), force_static=(0.01, 0.02), force_pull=(0.5, 0.5))
plt.axis('off')
plt.legend('', frameon=False)
plt.show()
In [58]:
# Extract required data
df_plot = pl_df_clean.select(["event_date", "country", "notes_clean"]).drop_nulls().to_pandas()
df_plot["event_date"] = pd.to_datetime(df_plot["event_date"])
# Validate length
assert len(df_plot) == len(topics), "❌ Length of 'topics' does not match the documents."
# Create base DataFrame
df_doc_topics = pd.DataFrame({
"event_date": df_plot["event_date"].values,
"country": df_plot["country"].values,
"topic": topics
})
# Add month column
df_doc_topics["month"] = df_doc_topics["event_date"].dt.to_period("M").dt.to_timestamp()
# Group by month, country, and topic
grouped = df_doc_topics.groupby(["month", "country", "topic"]).size().reset_index(name="count")
# Calculate % per month and country
grouped["total"] = grouped.groupby(["month", "country"])["count"].transform("sum")
grouped["percentage"] = 100 * grouped["count"] / grouped["total"]
# Add OpenAI topic labels
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
labels_df = pd.DataFrame([
{"topic": topic_id, "label": label_info[0][0].split("\n")[0]}
for topic_id, label_info in openai_topics.items()
])
labels_df["topic"] = labels_df["topic"].astype(int)
grouped["topic"] = grouped["topic"].astype(int)
grouped = grouped.merge(labels_df, on="topic", how="left")
# Filter top-N agendas per country and month
top_n = 3
top_agendas = grouped.sort_values(["month", "country", "percentage"], ascending=[True, True, False])\
.groupby(["month", "country"]).head(top_n)
top_agendas.head()
Out[58]:
| month | country | topic | count | total | percentage | label | |
|---|---|---|---|---|---|---|---|
| 0 | 2018-01-01 | Afghanistan | -1 | 1 | 3 | 33.333333 | Protests and Violence Against Women |
| 1 | 2018-01-01 | Afghanistan | 32 | 1 | 3 | 33.333333 | Taliban's Violations and Abuse Against Women |
| 2 | 2018-01-01 | Afghanistan | 77 | 1 | 3 | 33.333333 | Unidentified Armed Group Attacks on Women |
| 3 | 2018-01-01 | Albania | -1 | 1 | 1 | 100.000000 | Protests and Violence Against Women |
| 4 | 2018-01-01 | Argentina | -1 | 4 | 5 | 80.000000 | Protests and Violence Against Women |
In [60]:
# Ensure groupe' is created
april_2025 = grouped[grouped["month"] == "2025-04"]
# Sort by country and descending percentage
april_2025 = april_2025.sort_values(["country", "percentage"], ascending=[True, False])
# View top-N agendas per country (e.g., 3 per country)
top_n = 3
april_top = april_2025.groupby("country").head(top_n)
april_top[["country", "topic", "label", "percentage"]]
Out[60]:
| country | topic | label | percentage | |
|---|---|---|---|---|
| 18762 | Algeria | -1 | Protests and Violence Against Women | 50.000000 |
| 18763 | Algeria | 87 | Women's Protests for Political Prisoners' Imme... | 50.000000 |
| 18764 | Angola | 78 | Protests Against Farm Laws by SKM and Women Fa... | 100.000000 |
| 18765 | Argentina | 3 | Protests for Justice in Femicide Cases | 100.000000 |
| 18766 | Australia | 41 | Extinction Rebellion and Climate Action Protests | 100.000000 |
| ... | ... | ... | ... | ... |
| 18938 | Uzbekistan | 78 | Protests Against Farm Laws by SKM and Women Fa... | 100.000000 |
| 18939 | Venezuela | -1 | Protests and Violence Against Women | 100.000000 |
| 18945 | Yemen | 186 | Houthi-Sponsored Protests in Solidarity with P... | 38.095238 |
| 18940 | Yemen | -1 | Protests and Violence Against Women | 28.571429 |
| 18944 | Yemen | 161 | Houthi-Sponsored Protest in Solidarity with Pa... | 14.285714 |
145 rows × 4 columns
In [62]:
april_top.to_csv("Output/Agendas_April25.csv", index=False)
In [64]:
# Monthly Classification
In [66]:
!pip install sentence-transformers
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Requirement already satisfied: sentence-transformers in /opt/anaconda3/lib/python3.11/site-packages (5.0.0) Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.53.2) Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.65.0) Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (2.2.2) Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.6.1) Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.11.4) Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (0.33.4) Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (10.2.0) Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.14.0) Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.13.1) Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2023.6.0) Requirement already satisfied: packaging>=20.9 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (23.1) Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.1) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3) Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (1.1.5) Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (1.12) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1) Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.3) Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4) Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2023.10.3) Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.21.2) Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.2) Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (3.5.0) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2025.4.26) Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)
In [89]:
import pandas as pd
# Load the original file
df_women = pd.read_csv("Input/Jun25.csv")
# Apply the topic model to the 'notes' column
documents = df_women["notes"].astype(str).tolist()
topics, probs = topic_model.transform(documents)
# Add topic results to the original DataFrame
df_women["topic"] = topics
df_women["probability"] = probs
# Get and clean topic labels
raw_labels = topic_model.get_topic_info()[["Topic", "Name"]].set_index("Topic")["Name"].to_dict()
clean_labels = {
topic_id: (
label.split("_", 1)[1] if topic_id != -1 and "_" in label else "Unassigned"
)
for topic_id, label in raw_labels.items()
}
df_women["topic_label"] = df_women["topic"].map(clean_labels)
Batches: 0%| | 0/403 [00:00<?, ?it/s]
2025-07-14 18:32:25,256 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings. 2025-07-14 18:32:35,652 - BERTopic - Dimensionality - Completed ✓ 2025-07-14 18:32:35,653 - BERTopic - Clustering - Approximating new points with `hdbscan_model` 2025-07-14 18:32:36,337 - BERTopic - Cluster - Completed ✓
In [91]:
# Filter out rows with topic = -1 (unassigned)
df_women_filtered = df_women[df_women["topic"] != -1].copy()
# Check a sample
print(df_women_filtered[["country", "topic", "topic_label", "probability"]].sample(5))
country topic topic_label probability 7323 United States 20 trump_musk_elon_donald 0.802337 9384 Sudan 4 amini_mahsa_rioters_death 0.645559 10256 Iran 85 veterinary_doctor_murder_awarded 0.347326 292 Nepal 8 anganwadi_workers_helpers_assistants 0.452108 4033 United States 21 transgender_lgbtq_pride_trans 1.000000
In [93]:
# Save cleaned file
df_women_filtered[["country", "topic", "topic_label", "probability"]].to_csv("Output/Jun25.csv", index=False)